
char clCode_clppScan_GPU[]=
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#define T uint\n"
"#define OPERATOR_INDEXOF(I) I\n"
"#define OPERATOR_APPLY(A,B) A+B\n"
"#define OPERATOR_IDENTITY 0\n"
"#define VOLATILE\n"
"inline T scan_simt_exclusive(__local VOLATILE T* input, size_t idx, const uint lane)\n"
"{\n"
"	if (lane > 0 ) input[idx] = OPERATOR_APPLY(input[OPERATOR_INDEXOF(idx - 1)] , input[OPERATOR_INDEXOF(idx)]);\n"
"	if (lane > 1 ) input[idx] = OPERATOR_APPLY(input[OPERATOR_INDEXOF(idx - 2)] , input[OPERATOR_INDEXOF(idx)]);\n"
"	if (lane > 3 ) input[idx] = OPERATOR_APPLY(input[OPERATOR_INDEXOF(idx - 4)] , input[OPERATOR_INDEXOF(idx)]);\n"
"	if (lane > 7 ) input[idx] = OPERATOR_APPLY(input[OPERATOR_INDEXOF(idx - 8)] , input[OPERATOR_INDEXOF(idx)]);\n"
"	if (lane > 15) input[idx] = OPERATOR_APPLY(input[OPERATOR_INDEXOF(idx - 16)], input[OPERATOR_INDEXOF(idx)]);\n"
"		\n"
"	return (lane > 0) ? input[idx-1] : OPERATOR_IDENTITY;\n"
"}\n"
"inline T scan_simt_inclusive(__local VOLATILE T* input, size_t idx, const uint lane)\n"
"{	\n"
"	if (lane > 0 ) input[idx] = OPERATOR_APPLY(input[OPERATOR_INDEXOF(idx - 1)] , input[OPERATOR_INDEXOF(idx)]);\n"
"	if (lane > 1 ) input[idx] = OPERATOR_APPLY(input[OPERATOR_INDEXOF(idx - 2)] , input[OPERATOR_INDEXOF(idx)]);\n"
"	if (lane > 3 ) input[idx] = OPERATOR_APPLY(input[OPERATOR_INDEXOF(idx - 4)] , input[OPERATOR_INDEXOF(idx)]);\n"
"	if (lane > 7 ) input[idx] = OPERATOR_APPLY(input[OPERATOR_INDEXOF(idx - 8)] , input[OPERATOR_INDEXOF(idx)]);\n"
"	if (lane > 15) input[idx] = OPERATOR_APPLY(input[OPERATOR_INDEXOF(idx - 16)], input[OPERATOR_INDEXOF(idx)]);\n"
"		\n"
"	return input[idx];\n"
"}\n"
"inline T scan_workgroup_exclusive(__local T* localBuf, const uint idx, const uint lane, const uint simt_bid)\n"
"{\n"
"	// Step 1: Intra-warp scan in each warp\n"
"	T val = scan_simt_exclusive(localBuf, idx, lane);\n"
"	barrier(CLK_LOCAL_MEM_FENCE);\n"
"	\n"
"	// Step 2: Collect per-warp partial results (the sum)\n"
"	if (lane > 30) localBuf[simt_bid] = localBuf[idx];\n"
"	barrier(CLK_LOCAL_MEM_FENCE);\n"
"	\n"
"	// Step 3: Use 1st warp to scan per-warp results\n"
"	if (simt_bid < 1) scan_simt_inclusive(localBuf, idx, lane);\n"
"	barrier(CLK_LOCAL_MEM_FENCE);\n"
"	\n"
"	// Step 4: Accumulate results from Steps 1 and 3\n"
"	if (simt_bid > 0) val = OPERATOR_APPLY(localBuf[simt_bid-1], val);\n"
"	barrier(CLK_LOCAL_MEM_FENCE);\n"
"	\n"
"	// Step 5: Write and return the final result\n"
"	localBuf[idx] = val;\n"
"	barrier(CLK_LOCAL_MEM_FENCE);\n"
"	\n"
"	return val;\n"
"}\n"
"__kernel\n"
"void kernel__scan_block_anylength(\n"
"	__local T* localBuf,\n"
"	__global T* dataSet,\n"
"	const uint B,\n"
"	uint size,\n"
"	const uint passesCount\n"
")\n"
"{	\n"
"	size_t idx = get_local_id(0);\n"
"	const uint bidx = get_group_id(0);\n"
"	const uint TC = get_local_size(0);\n"
"	\n"
"	const uint lane = idx & 31;\n"
"	const uint simt_bid = idx >> 5;\n"
"	\n"
"	T reduceValue = OPERATOR_IDENTITY;\n"
"	\n"
"	//#pragma unroll 4\n"
"	for(uint i = 0; i < passesCount; ++i)\n"
"	{\n"
"		const uint offset = i * TC + (bidx * B);\n"
"		const uint offsetIdx = offset + idx;\n"
"		\n"
"#ifdef OCL_PLATFORM_AMD\n"
"		if (offsetIdx > size-1)\n"
"		{\n"
"			// To avoid to lock !\n"
"			barrier(CLK_LOCAL_MEM_FENCE);\n"
"			barrier(CLK_LOCAL_MEM_FENCE);\n"
"			barrier(CLK_LOCAL_MEM_FENCE);\n"
"			\n"
"			barrier(CLK_LOCAL_MEM_FENCE);\n"
"			barrier(CLK_LOCAL_MEM_FENCE);\n"
"			barrier(CLK_LOCAL_MEM_FENCE);\n"
"			barrier(CLK_LOCAL_MEM_FENCE);\n"
"			barrier(CLK_LOCAL_MEM_FENCE);\n"
"			continue;\n"
"		}\n"
"#else\n"
"		if (offsetIdx > size-1) return;\n"
"#endif\n"
"		// Step 1: Read TC elements from global (off-chip) memory to local memory (on-chip)\n"
"		T input = localBuf[idx] = dataSet[offsetIdx];		\n"
"		\n"
"		/*\n"
"		// This version try to avoid bank conflicts and improve memory access serializations !\n"
"		if (lane < 1)\n"
"		{\n"
"			__global T* currentOffset = inputDatas + offsetIdx;\n"
"			vstore16(vload16(0, currentOffset),  0, localBuf);\n"
"			vstore16(vload16(0, currentOffset + 16), 16, localBuf);\n"
"		}\n"
"		barrier(CLK_LOCAL_MEM_FENCE);\n"
"		T input = localBuf[idx];\n"
"		*/\n"
"		\n"
"		barrier(CLK_LOCAL_MEM_FENCE);\n"
"		\n"
"		// Step 2: Perform scan on TC elements\n"
"		T val = scan_workgroup_exclusive(localBuf, idx, lane, simt_bid);\n"
"		\n"
"		// Step 3: Propagate reduced result from previous block of TC elements\n"
"		val = OPERATOR_APPLY(val, reduceValue);\n"
"		\n"
"		// Step 4: Write out data to global memory\n"
"		dataSet[offsetIdx] = val;\n"
"		\n"
"		// Step 5: Choose reduced value for next iteration\n"
"		if (idx == (TC-1))\n"
"		{\n"
"			//localBuf[idx] = (Kind == exclusive) ? OPERATOR_APPLY(input, val) : val;\n"
"			localBuf[idx] = OPERATOR_APPLY(input, val);\n"
"		}\n"
"		barrier(CLK_LOCAL_MEM_FENCE);\n"
"		\n"
"		reduceValue = localBuf[TC-1];\n"
"		barrier(CLK_LOCAL_MEM_FENCE);\n"
"	}\n"
"}\n"
;
